Commit Β·
23139be
1
Parent(s): 3d5b1f6
π ChunkiFy Phase 1: Initial app upload
Browse files- README copy.md +39 -0
- app.py +39 -0
- chunker.py +51 -0
- pdf_reader.py +22 -0
- requirements.txt +2 -0
README copy.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π§© ChunkiFy β Smart PDF Text Chunking App
|
| 2 |
+
|
| 3 |
+
Slice long PDFs into clean, overlapping chunks.
|
| 4 |
+
Built for reading clarity, NLP projects, and AI pipelines.
|
| 5 |
+
|
| 6 |
+
## π What ChunkiFy Does
|
| 7 |
+
|
| 8 |
+
- π Upload any PDF
|
| 9 |
+
- π§ Extracts clean text using PyMuPDF
|
| 10 |
+
- π Splits into sentence-paragraphs
|
| 11 |
+
- π§© Merges smartly into ~200-word chunks with 50-word overlap
|
| 12 |
+
- β
Outputs clean, readable, and consistent content blocks
|
| 13 |
+
|
| 14 |
+
## π₯οΈ Built With
|
| 15 |
+
|
| 16 |
+
- Python
|
| 17 |
+
- PyMuPDF (`fitz`)
|
| 18 |
+
- Gradio
|
| 19 |
+
|
| 20 |
+
## π Use Cases
|
| 21 |
+
|
| 22 |
+
- AI/NLP text preprocessing
|
| 23 |
+
- Book summary readers
|
| 24 |
+
- Embedding pipelines (Phase 2)
|
| 25 |
+
- Focused learning from dense PDFs
|
| 26 |
+
|
| 27 |
+
## π Phase 1: Chunking Milestone
|
| 28 |
+
|
| 29 |
+
This is Phase 1 of a larger AI project.
|
| 30 |
+
Upcoming phases: vector embeddings, semantic search, PDF Q&A chatbot.
|
| 31 |
+
|
| 32 |
+
## π¨βπ» Made by Umer
|
| 33 |
+
|
| 34 |
+
A personal learning sprint turned into a useful microtool.
|
| 35 |
+
[Gradio App Live](link) | [GitHub](link)
|
| 36 |
+
|
| 37 |
+
---
|
| 38 |
+
|
| 39 |
+
> ChunkiFy is not just a toy. It's a foundational block in building smarter AI tools.
|
app.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from pdf_reader import extract_text_from_pdf
|
| 4 |
+
from chunker import chunk_text
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def process_pdf(file):
    """Run the full upload -> extract -> chunk -> format pipeline.

    Args:
        file: Gradio file-upload object; its ``.name`` is the on-disk
            path of the uploaded temp file (assumed — TODO confirm
            against the Gradio version pinned for this app).

    Returns:
        A single display string with every chunk under a
        ``--- Chunk N ---`` header, separated by blank lines.
    """
    # Gradio has already written the upload to disk; reuse that path.
    pdf_path = file.name

    # Extract raw text, then slice it into chunks.
    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)

    # Defensive: tolerate a (chunks, extra) tuple return from chunk_text.
    if isinstance(chunks, tuple):
        chunks = chunks[0]

    # Render each chunk under a numbered header; join keeps one blank
    # line between sections and no trailing whitespace.
    sections = [
        f"--- Chunk {idx} ---\n{chunk.strip()}"
        for idx, chunk in enumerate(chunks, start=1)
    ]
    return "\n\n".join(sections)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Gradio UI: one file-upload input mapped straight onto process_pdf,
# one large read-only textbox showing the formatted chunks.
demo = gr.Interface(
    fn=process_pdf,
    # Restrict the picker to PDFs; process_pdf assumes a PDF path.
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    # 30 visible lines plus a copy button, since chunk output is long.
    outputs=gr.Textbox(label="Chunked Output",
                       lines=30, show_copy_button=True),
    title="π§© ChunkiFy β Smart PDF Text Chunking",
    description="Slice long PDFs into clean, overlapping chunks using PyMuPDF + Gradio.",
)

# Start the local Gradio server only when run as a script, so importing
# this module (e.g. for tests) has no side effects.
if __name__ == "__main__":
    demo.launch()
|
chunker.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def chunk_text(text, max_words=200, overlap=50):
    """Split raw text into word-bounded chunks with word overlap.

    Short inputs (<= 500 total words) are returned as sentence-like
    fragments with no merging; longer inputs are merged greedily into
    chunks of at most ``max_words`` words, each new chunk seeded with
    the last ``overlap`` words of the previous one.

    Args:
        text: Raw extracted text (may contain hard line breaks).
        max_words: Soft cap on words per merged chunk.
        overlap: Number of trailing words carried into the next chunk.

    Returns:
        list[str]: sentence fragments (short input) or merged chunks.
    """
    # Step 1: Clean the text. Join broken line breaks, then collapse the
    # double spaces that joining can create. (The original second
    # replace was a no-op: it replaced a single space with a single
    # space — presumably garbled from '  ' -> ' '.)
    clean_text = text.replace('\n', ' ').replace('  ', ' ')

    # Step 2: Split into sentence-like fragments on ". " (the period
    # stays on the preceding fragment only for the final sentence).
    paragraphs = clean_text.split('. ')
    print(f"Total sentence-paragraphs found: {len(paragraphs)}")

    total_words = sum(len(p.split()) for p in paragraphs)
    print(f"Total word count: {total_words}")

    # Short documents need no merging — return the fragments directly.
    if total_words <= 500:
        print("Short PDF β returning sentence-paragraphs without merging.")
        return paragraphs

    print("Long PDF β using merged 200-word chunks with 50-word overlap.")

    chunks = []
    current_chunk = []        # fragments accumulated for the open chunk
    current_word_count = 0    # running word total of current_chunk

    for para in paragraphs:
        word_count = len(para.split())

        if current_word_count + word_count <= max_words:
            # Fragment still fits — keep accumulating.
            current_chunk.append(para)
            current_word_count += word_count
        else:
            # Flush the open chunk. NOTE: the original bound this value
            # to a local named ``chunk_text``, shadowing the function
            # itself; renamed to ``merged`` to remove the shadowing.
            merged = '. '.join(current_chunk).strip()
            chunks.append(merged)

            # Seed the next chunk with the last ``overlap`` words.
            # A negative slice already yields the whole list when it is
            # shorter than ``overlap``, so no length guard is needed.
            overlap_words = merged.split()[-overlap:]

            current_chunk = [' '.join(overlap_words), para]
            current_word_count = len(overlap_words) + word_count

    # Flush whatever is left after the loop.
    if current_chunk:
        chunks.append('. '.join(current_chunk).strip())

    return chunks
|
pdf_reader.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz # PyMuPDF
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Filesystem path to the PDF.

    Returns:
        str: All page texts concatenated, with a newline appended
        after each page (empty string for a zero-page document).
    """
    pages = []
    # Context manager guarantees the document handle is closed even if
    # text extraction raises — the original leaked it on error.
    with fitz.open(pdf_path) as doc:
        for i, page in enumerate(doc):
            print(f"π Processing page {i + 1} of {doc.page_count}")
            pages.append(page.get_text())

    # Single join instead of quadratic str += inside the loop; the
    # generator reproduces the original's trailing "\n" per page.
    return "".join(text + "\n" for text in pages)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# Manual smoke test: run this module directly to preview extraction on a
# local sample PDF (hard-coded developer path — not used by the app).
if __name__ == "__main__":
    pdf_path = "/Users/mac/Desktop/brief-summary-of-atomic-habits.pdf"
    text = extract_text_from_pdf(pdf_path)
    print(text[:1000])  # Preview first 1000 characters
    # f-prefix removed: the string has no placeholders.
    print("β Text extraction complete.")
|
requirements.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
PyMuPDF
|