Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

Shubham170793 committed on Oct 5

Commit

6b0c8b8

verified ·

1 Parent(s): 6403c55

Update src/ingestion.py

Browse files

Files changed (1) hide show

src/ingestion.py +43 -34

src/ingestion.py CHANGED Viewed

@@ -2,81 +2,90 @@ import re
 import fitz  # PyMuPDF
 # -----------------------------
-# TEXT EXTRACTION
 # -----------------------------
 def extract_text_from_pdf(file_path: str) -> str:
     """
-    Extracts text from a PDF file using PyMuPDF.
     Args:
         file_path (str): Path to the PDF file.
     Returns:
-        str: The extracted text from the PDF.
     """
     text = ""
-    with fitz.open(file_path) as pdf:
-        for page in pdf:
-            text += page.get_text("text")  # Extracts text from each page
     return text
 # -----------------------------
-# SMART CHUNKING (sentence-aware)
 # -----------------------------
 def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
     """
-    Splits extracted text into meaningful, overlapping, sentence-based chunks.
-    Optimized for Hugging Face Spaces (low memory & local inference).
     Args:
-        text (str): Extracted document text.
         chunk_size (int): Max characters per chunk (default: 800).
-        overlap (int): Overlapping characters between chunks (default: 150).
     Returns:
-        list[str]: List of text chunks.
     """
-    # Step 1. Clean and normalize whitespace
     text = re.sub(r'\s+', ' ', text.strip())
-    # Step 2. Split into sentences (simple but effective heuristic)
     sentences = re.split(r'(?<=[.!?])\s+', text)
-    chunks = []
-    current_chunk = ""
-    # Step 3. Build chunks by adding sentences until limit is reached
-    for sentence in sentences:
-        if len(current_chunk) + len(sentence) + 1 <= chunk_size:
-            current_chunk += " " + sentence
         else:
-            # Save completed chunk
-            if current_chunk.strip():
-                chunks.append(current_chunk.strip())
-            # Create overlap (for context continuity)
-            overlap_part = current_chunk[-overlap:] if overlap > 0 else ""
-            current_chunk = overlap_part + " " + sentence
-    # Step 4. Add final chunk
-    if current_chunk.strip():
-        chunks.append(current_chunk.strip())
     return chunks
 # -----------------------------
-# OPTIONAL DEBUG / SANITY CHECK
 # -----------------------------
 if __name__ == "__main__":
-    # Quick local test
     sample_text = """
-    Artificial Intelligence is transforming industries.
     Machine learning is a key subfield, driving automation and predictive analytics.
     Neural networks power most modern AI applications today.
     """
     chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
-    print("Chunks created:", len(chunks))
     for i, c in enumerate(chunks, 1):
         print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")

 import fitz  # PyMuPDF
 # -----------------------------
+# TEXT EXTRACTION (Robust)
 # -----------------------------
 def extract_text_from_pdf(file_path: str) -> str:
     """
     Extracts text from a PDF using PyMuPDF and normalizes its whitespace.

     Each page is read with ``get_text("text")``. When that yields nothing,
     the page's text blocks are joined instead. No OCR is performed, so
     image-only (scanned) pages contribute no text.

     Args:
         file_path (str): Path to the PDF file.

     Returns:
         str: Text of all pages combined, with every run of whitespace
         collapsed to a single space and both ends stripped.

     Raises:
         RuntimeError: If the document cannot be opened or read. The
             underlying exception is attached as ``__cause__``.
     """
     pages = []
     try:
         with fitz.open(file_path) as pdf:
             for page in pdf:
                 page_text = page.get_text("text").strip()
                 if not page_text:
                     # Fallback: rebuild the page from its blocks. Per the
                     # PyMuPDF docs a block is
                     # (x0, y0, x1, y1, text, block_no, block_type) and
                     # block_type 0 marks a text block; image blocks hold a
                     # metadata placeholder in the text slot, so skip them.
                     page_text = " ".join(
                         block[4]
                         for block in page.get_text("blocks")
                         if isinstance(block[4], str) and block[6] == 0
                     )
                 pages.append(page_text)
     except Exception as e:
         # Chain the cause so the original traceback is not lost.
         raise RuntimeError(f"❌ PDF extraction failed: {e}") from e
     # Page separators and any other whitespace runs collapse to one space.
     return re.sub(r'\s+', ' ', "\n".join(pages)).strip()
 # -----------------------------
+# SMART CHUNKING (Context Aware)
 # -----------------------------
 def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
     """
     Splits text into overlapping, sentence-based chunks.

     Whitespace is normalized first, then sentences (split after ``.``,
     ``!`` or ``?``) are packed greedily into chunks. Each new chunk starts
     with the trailing words of the previous one so context carries over.

     Args:
         text (str): Input text.
         chunk_size (int): Max characters per chunk (default: 800). No
             returned chunk is longer than this; a sentence that does not
             fit is split on word boundaries (and a single word longer
             than ``chunk_size`` is sliced).
         overlap (int): Approximate number of trailing characters of the
             previous chunk to repeat at the start of the next one
             (default: 150). The overlap is trimmed to whole words and is
             dropped when it would push the chunk past ``chunk_size``.
             Values <= 0 disable overlap.

     Returns:
         list[str]: Chunked text segments; empty for blank input.

     Raises:
         ValueError: If ``chunk_size`` is not positive.
     """
     if chunk_size <= 0:
         raise ValueError("chunk_size must be a positive integer")

     # Clean text once
     text = re.sub(r'\s+', ' ', text.strip())
     if not text:
         return []

     # Sentence segmentation (simple rule-based, fast)
     sentences = re.split(r'(?<=[.!?])\s+', text)

     chunks, current = [], ""
     for sent in sentences:
         for piece in _split_oversized(sent, chunk_size):
             candidate = f"{current} {piece}" if current else piece
             if len(candidate) <= chunk_size:
                 current = candidate
                 continue
             # `current` is non-empty here: a lone piece always fits.
             chunks.append(current)
             tail = _overlap_tail(current, overlap)
             seeded = f"{tail} {piece}" if tail else piece
             # Keep the overlap only if the size limit still holds.
             current = seeded if len(seeded) <= chunk_size else piece

     # Append the last chunk
     if current:
         chunks.append(current)
     return chunks


 def _overlap_tail(chunk: str, overlap: int) -> str:
     """Return up to `overlap` trailing characters of `chunk`, whole words only."""
     if overlap <= 0:
         return ""
     if len(chunk) <= overlap:
         return chunk
     start = len(chunk) - overlap
     if chunk[start - 1] != " " and chunk[start] != " ":
         # The cut lands inside a word: skip forward to the next word.
         start = chunk.find(" ", start)
         if start == -1:
             return ""
     return chunk[start:].strip()


 def _split_oversized(sentence: str, limit: int) -> list:
     """Break `sentence` into word-aligned pieces of at most `limit` characters."""
     if len(sentence) <= limit:
         return [sentence]
     pieces, current = [], ""
     for word in sentence.split(" "):
         # A single word longer than the limit has to be sliced.
         while len(word) > limit:
             if current:
                 pieces.append(current)
                 current = ""
             pieces.append(word[:limit])
             word = word[limit:]
         if not word:
             continue
         candidate = f"{current} {word}" if current else word
         if len(candidate) <= limit:
             current = candidate
         else:
             pieces.append(current)
             current = word
     if current:
         pieces.append(current)
     return pieces
 # -----------------------------
+# DEBUGGING (Manual Run)
 # -----------------------------
 if __name__ == "__main__":
     # Manual smoke test: chunk a short paragraph with a small size limit
     # and print every resulting chunk with its length.
     demo_text = """
     Artificial Intelligence is transforming industries.
     Machine learning is a key subfield, driving automation and predictive analytics.
     Neural networks power most modern AI applications today.
     This technology is reshaping healthcare, finance, and manufacturing.
     """
     demo_chunks = chunk_text(demo_text, chunk_size=80, overlap=20)
     print(f"✅ Chunks created: {len(demo_chunks)}")
     for number, segment in enumerate(demo_chunks, start=1):
         print(f"\n--- Chunk {number} ({len(segment)} chars) ---\n{segment}")