uumerrr684 committed on
Commit
23139be
·
1 Parent(s): 3d5b1f6

🚀 ChunkiFy Phase 1: Initial app upload

Browse files
Files changed (5) hide show
  1. README copy.md +39 -0
  2. app.py +39 -0
  3. chunker.py +51 -0
  4. pdf_reader.py +22 -0
  5. requirements.txt +2 -0
README copy.md ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🧩 ChunkiFy – Smart PDF Text Chunking App
2
+
3
+ Slice long PDFs into clean, overlapping chunks.
4
+ Built for reading clarity, NLP projects, and AI pipelines.
5
+
6
+ ## 🚀 What ChunkiFy Does
7
+
8
+ - 📄 Upload any PDF
9
+ - 🧠 Extracts clean text using PyMuPDF
10
+ - 🔍 Splits into sentence-paragraphs
11
+ - 🧩 Merges smartly into ~200-word chunks with 50-word overlap
12
+ - βœ… Outputs clean, readable, and consistent content blocks
13
+
14
+ ## 🖥️ Built With
15
+
16
+ - Python
17
+ - PyMuPDF (`fitz`)
18
+ - Gradio
19
+
20
+ ## 🔍 Use Cases
21
+
22
+ - AI/NLP text preprocessing
23
+ - Book summary readers
24
+ - Embedding pipelines (Phase 2)
25
+ - Focused learning from dense PDFs
26
+
27
+ ## 🏁 Phase 1: Chunking Milestone
28
+
29
+ This is Phase 1 of a larger AI project.
30
+ Upcoming phases: vector embeddings, semantic search, PDF Q&A chatbot.
31
+
32
+ ## 👨‍💻 Made by Umer
33
+
34
+ A personal learning sprint turned into a useful microtool.
35
+ [Gradio App Live](link) | [GitHub](link)
36
+
37
+ ---
38
+
39
+ > ChunkiFy is not just a toy. It’s a foundational block in building smarter AI tools.
app.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ from pdf_reader import extract_text_from_pdf
4
+ from chunker import chunk_text
5
+
6
+
7
def process_pdf(file):
    """Extract the text of an uploaded PDF and return it as labelled chunks.

    Args:
        file: The upload value handed over by the UI. The path is read from
            its ``name`` attribute when present; otherwise the value itself
            is used as the path (NOTE(review): the exact type depends on the
            installed Gradio version -- confirm). ``None`` means nothing was
            uploaded.

    Returns:
        A single string in which every chunk is preceded by a
        ``--- Chunk N ---`` header and chunks are separated by a blank line,
        or a short prompt when no file was supplied.
    """
    # Submitting the form without a file must not crash with AttributeError.
    if file is None:
        return "Please upload a PDF file."

    # The upload already lives on disk; we only need its path.
    pdf_path = getattr(file, "name", file)

    text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(text)

    # Tolerate a (chunks, extra) tuple return from the chunker.
    if isinstance(chunks, tuple):
        chunks = chunks[0]

    # Join once instead of growing a string inside the loop.
    formatted = "\n\n".join(
        f"--- Chunk {number} ---\n{chunk.strip()}"
        for number, chunk in enumerate(chunks, start=1)
    )
    return formatted.strip()
27
+
28
+
29
# Gradio UI: a single PDF upload feeding process_pdf, with the chunked text
# shown in a copyable text box.
demo = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(
        label="Chunked Output",
        lines=30,
        show_copy_button=True,
    ),
    # Title restored from mis-decoded UTF-8 (puzzle-piece emoji and en dash).
    title="🧩 ChunkiFy – Smart PDF Text Chunking",
    description="Slice long PDFs into clean, overlapping chunks using PyMuPDF + Gradio.",
)

# Start the web server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()
chunker.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def chunk_text(text, max_words=200, overlap=50):
    """Split text into sentence fragments or merged, overlapping chunks.

    The text is whitespace-normalised and split on ``'. '``. If the total
    word count is 500 or fewer, the individual sentence fragments are
    returned unmerged. Otherwise fragments are merged greedily into chunks
    of at most ``max_words`` words, and each new chunk starts with the last
    ``overlap`` words of the chunk before it (so a chunk may exceed
    ``max_words`` by the overlap or by one oversized sentence).

    Args:
        text: The raw text to chunk.
        max_words: Word budget used when deciding whether another sentence
            still fits into the current chunk.
        overlap: Number of trailing words repeated at the start of the next
            chunk. Zero or a negative value disables the overlap.

    Returns:
        A list of non-empty strings. Empty or whitespace-only input yields
        an empty list.
    """
    # Step 1: collapse every whitespace run (newlines, tabs, repeated
    # spaces) into a single space.
    clean_text = ' '.join(text.split())

    # Step 2: split into sentence-like fragments, dropping blank ones so
    # they never surface as empty chunks.
    paragraphs = [p for p in clean_text.split('. ') if p.strip()]
    print(f"Total sentence-paragraphs found: {len(paragraphs)}")

    total_words = sum(len(p.split()) for p in paragraphs)
    print(f"Total word count: {total_words}")

    # Short documents are returned sentence by sentence, without merging.
    if total_words <= 500:
        print("Short PDF — returning sentence-paragraphs without merging.")
        return paragraphs

    print(
        f"Long PDF — using merged {max_words}-word chunks "
        f"with {overlap}-word overlap."
    )

    chunks = []
    current_chunk = []
    current_word_count = 0

    for para in paragraphs:
        word_count = len(para.split())

        if current_word_count + word_count <= max_words:
            current_chunk.append(para)
            current_word_count += word_count
            continue

        overlap_words = []
        # Nothing to flush when the very first sentence alone exceeds the
        # budget; flushing here would emit an empty chunk.
        if current_chunk:
            merged = '. '.join(current_chunk).strip()
            chunks.append(merged)
            # Guard overlap == 0 explicitly: words[-0:] is the WHOLE list,
            # which would copy the entire previous chunk forward.
            if overlap > 0:
                overlap_words = merged.split()[-overlap:]

        # Seed the next chunk with the overlap (if any) plus the sentence
        # that did not fit.
        current_chunk = [' '.join(overlap_words)] if overlap_words else []
        current_chunk.append(para)
        current_word_count = len(overlap_words) + word_count

    # Save any leftover content.
    if current_chunk:
        chunks.append('. '.join(current_chunk).strip())

    return chunks
pdf_reader.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+
3
+
4
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.

    Returns:
        The text of all pages in order, each page followed by a newline.
    """
    page_texts = []

    # The context manager closes the document even if a page fails to
    # extract, which the previous explicit close() call did not guarantee.
    with fitz.open(pdf_path) as doc:
        total_pages = doc.page_count
        for page_number, page in enumerate(doc, start=1):
            print(f"📄 Processing page {page_number} of {total_pages}")
            page_texts.append(page.get_text() + "\n")

    # Join once instead of growing a string page by page.
    return "".join(page_texts)
15
+
16
+
17
# Manual smoke test: extract a PDF and preview the start of its text.
if __name__ == "__main__":
    import sys

    # Take the PDF path from the command line when given; otherwise fall
    # back to the original sample path so existing usage keeps working.
    pdf_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "/Users/mac/Desktop/brief-summary-of-atomic-habits.pdf"
    )
    text = extract_text_from_pdf(pdf_path)
    print(text[:1000])  # Preview first 1000 characters
    print("✅ Text extraction complete.")
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ PyMuPDF