Shubham170793 committed on
Commit
275cb5c
·
verified ·
1 Parent(s): 4d99eea

Create ingestion.py

Browse files
Files changed (1) hide show
  1. src/ingestion.py +43 -0
src/ingestion.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+
4
+
5
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts text from a PDF file using PyMuPDF.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: The text of every page, concatenated in page order. No
            separator is inserted between pages. An empty string is
            returned when the document yields no pages.
    """
    # Keep the document open only for the duration of the extraction.
    with fitz.open(file_path) as pdf:
        # Collect each page's text and join once, instead of growing a
        # string with += on every iteration.
        return "".join(page.get_text() for page in pdf)
23
+
24
+
25
+
26
+
27
def chunk_text(text: str, chunk_size: int = 500, chunk_overlap: int = 50) -> list:
    """
    Breaks a block of text into smaller pieces that overlap one another.

    The work is delegated to a RecursiveCharacterTextSplitter configured
    with the given size and overlap.

    Args:
        text (str): The text to split.
        chunk_size (int): Size limit handed to the splitter for each chunk.
        chunk_overlap (int): Overlap handed to the splitter for
            neighbouring chunks.

    Returns:
        list: The chunks produced by the splitter's ``split_text`` call.
    """
    # Gather the splitter configuration first, then build and use it in one go.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)