GirishaBuilds01 committed on
Commit
b9fe585
·
verified ·
1 Parent(s): 58de391

Create extraction.py

Browse files
Files changed (1) hide show
  1. extraction.py +21 -0
extraction.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pypdf import PdfReader
2
+ from utils import chunk_text
3
+ from config import CHUNK_SIZE, CHUNK_OVERLAP
4
+
5
+
6
def extract_text_from_pdf(file_path):
    """Return the text of every page in a PDF, each preceded by a page marker.

    Args:
        file_path: Location of the PDF; passed unchanged to ``PdfReader``.

    Returns:
        One string made of a section per page that yielded text. Each section
        is two newlines, a ``--- Page N ---`` header line, then the page text.
        ``N`` is the 1-based position of the page in the document, so pages
        that are skipped leave a gap in the numbering. The result is an empty
        string when no page yields text.
    """
    reader = PdfReader(file_path)

    # Collect the per-page sections and join once at the end instead of
    # growing a string with ``+=`` on every iteration.
    sections = []
    for page_number, page in enumerate(reader.pages, start=1):
        text = page.extract_text()
        # Skip pages whose extraction result is falsy (e.g. an empty string).
        if text:
            sections.append(f"\n\n--- Page {page_number} ---\n{text}")

    return "".join(sections)
16
+
17
+
18
def process_pdf(file_path):
    """Extract a PDF's text and split it into chunks.

    The page-annotated text produced by ``extract_text_from_pdf`` is handed to
    ``chunk_text`` together with the configured ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``; whatever ``chunk_text`` returns is passed back as-is.

    Args:
        file_path: Location of the PDF, forwarded to ``extract_text_from_pdf``.
    """
    return chunk_text(
        extract_text_from_pdf(file_path),
        CHUNK_SIZE,
        CHUNK_OVERLAP,
    )