Shubham170793 committed on
Commit
6b0c8b8
·
verified ·
1 Parent(s): 6403c55

Update src/ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +43 -34
src/ingestion.py CHANGED
@@ -2,81 +2,90 @@ import re
2
  import fitz # PyMuPDF
3
 
4
  # -----------------------------
5
- # TEXT EXTRACTION
6
  # -----------------------------
7
  def extract_text_from_pdf(file_path: str) -> str:
8
  """
9
- Extracts text from a PDF file using PyMuPDF.
 
10
 
11
  Args:
12
  file_path (str): Path to the PDF file.
13
-
14
  Returns:
15
- str: The extracted text from the PDF.
16
  """
17
  text = ""
18
- with fitz.open(file_path) as pdf:
19
- for page in pdf:
20
- text += page.get_text("text") # Extracts text from each page
 
 
 
 
 
 
 
 
 
 
 
21
  return text
22
 
23
 
24
  # -----------------------------
25
- # SMART CHUNKING (sentence-aware)
26
  # -----------------------------
27
  def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
28
  """
29
- Splits extracted text into meaningful, overlapping, sentence-based chunks.
30
- Optimized for Hugging Face Spaces (low memory & local inference).
31
 
32
  Args:
33
- text (str): Extracted document text.
34
  chunk_size (int): Max characters per chunk (default: 800).
35
- overlap (int): Overlapping characters between chunks (default: 150).
36
 
37
  Returns:
38
- list[str]: List of text chunks.
39
  """
40
- # Step 1. Clean and normalize whitespace
41
  text = re.sub(r'\s+', ' ', text.strip())
42
 
43
- # Step 2. Split into sentences (simple but effective heuristic)
44
  sentences = re.split(r'(?<=[.!?])\s+', text)
45
 
46
- chunks = []
47
- current_chunk = ""
48
 
49
- # Step 3. Build chunks by adding sentences until limit is reached
50
- for sentence in sentences:
51
- if len(current_chunk) + len(sentence) + 1 <= chunk_size:
52
- current_chunk += " " + sentence
53
  else:
54
- # Save completed chunk
55
- if current_chunk.strip():
56
- chunks.append(current_chunk.strip())
57
 
58
- # Create overlap (for context continuity)
59
- overlap_part = current_chunk[-overlap:] if overlap > 0 else ""
60
- current_chunk = overlap_part + " " + sentence
61
 
62
- # Step 4. Add final chunk
63
- if current_chunk.strip():
64
- chunks.append(current_chunk.strip())
65
 
66
  return chunks
67
 
68
 
69
  # -----------------------------
70
- # OPTIONAL DEBUG / SANITY CHECK
71
  # -----------------------------
72
  if __name__ == "__main__":
73
- # Quick local test
74
  sample_text = """
75
- Artificial Intelligence is transforming industries.
76
  Machine learning is a key subfield, driving automation and predictive analytics.
77
  Neural networks power most modern AI applications today.
 
78
  """
79
  chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
80
- print("Chunks created:", len(chunks))
81
  for i, c in enumerate(chunks, 1):
82
  print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")
 
2
  import fitz # PyMuPDF
3
 
4
  # -----------------------------
5
+ # TEXT EXTRACTION (Robust)
6
  # -----------------------------
7
  def extract_text_from_pdf(file_path: str) -> str:
8
  """
9
+ Extracts and cleans text from a PDF using PyMuPDF.
10
+ Handles both textual and scanned PDFs gracefully.
11
 
12
  Args:
13
  file_path (str): Path to the PDF file.
 
14
  Returns:
15
+ str: Combined extracted text.
16
  """
17
  text = ""
18
+ try:
19
+ with fitz.open(file_path) as pdf:
20
+ for page in pdf:
21
+ page_text = page.get_text("text").strip()
22
+ if not page_text:
23
+ # Fallback: extract raw blocks (helps with weird PDFs)
24
+ blocks = page.get_text("blocks")
25
+ page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
26
+ text += page_text + "\n"
27
+ except Exception as e:
28
+ raise RuntimeError(f"❌ PDF extraction failed: {e}")
29
+
30
+ # Clean out any extra whitespace or control characters
31
+ text = re.sub(r'\s+', ' ', text).strip()
32
  return text
33
 
34
 
35
  # -----------------------------
36
+ # SMART CHUNKING (Context Aware)
37
  # -----------------------------
38
  def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
39
  """
40
+ Splits text into overlapping, sentence-based chunks.
41
+ Optimized for embedding models (E5, MiniLM, etc.) for semantic retrieval.
42
 
43
  Args:
44
+ text (str): Input text.
45
  chunk_size (int): Max characters per chunk (default: 800).
46
+ overlap (int): Overlapping characters for continuity (default: 150).
47
 
48
  Returns:
49
+ list[str]: Chunked text segments.
50
  """
51
+ # Clean text once
52
  text = re.sub(r'\s+', ' ', text.strip())
53
 
54
+ # Sentence segmentation (simple rule-based, fast)
55
  sentences = re.split(r'(?<=[.!?])\s+', text)
56
 
57
+ chunks, current = [], ""
 
58
 
59
+ for sent in sentences:
60
+ if len(current) + len(sent) + 1 <= chunk_size:
61
+ current += " " + sent
 
62
  else:
63
+ # Store full chunk
64
+ if current.strip():
65
+ chunks.append(current.strip())
66
 
67
+ # Overlap control
68
+ overlap_part = current[-overlap:] if overlap > 0 else ""
69
+ current = overlap_part + " " + sent
70
 
71
+ # Append the last chunk
72
+ if current.strip():
73
+ chunks.append(current.strip())
74
 
75
  return chunks
76
 
77
 
78
  # -----------------------------
79
+ # DEBUGGING (Manual Run)
80
  # -----------------------------
81
  if __name__ == "__main__":
 
82
  sample_text = """
83
+ Artificial Intelligence is transforming industries.
84
  Machine learning is a key subfield, driving automation and predictive analytics.
85
  Neural networks power most modern AI applications today.
86
+ This technology is reshaping healthcare, finance, and manufacturing.
87
  """
88
  chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
89
+ print(f"Chunks created: {len(chunks)}")
90
  for i, c in enumerate(chunks, 1):
91
  print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")