AptlyDigital committed on
Commit
68e634d
·
verified ·
1 Parent(s): b18f16f

Upload 3 files

Browse files
documents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Empty init files
documents/chunker.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2
+
3
class DocumentChunker:
    """Split text into chunks using a RecursiveCharacterTextSplitter."""

    def __init__(self, chunk_size=500, chunk_overlap=50):
        """Create the underlying splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``.
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_text(self, text):
        """Return whatever the configured splitter's ``split_text`` yields for *text*."""
        pieces = self.splitter.split_text(text)
        return pieces
documents/loader.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import PyPDF2
2
+ from PIL import Image
3
+ import pytesseract
4
+
5
class DocumentLoader:
    """Extract plain text from PDF files and from images (via OCR)."""

    def load_pdf(self, file_path):
        """Return the text of every page of a PDF, concatenated in page order.

        Args:
            file_path: Path of the PDF file to read.

        Returns:
            The extracted text of all pages joined with no separator.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join inside the ``with`` block: pages are read from the open
            # file handle. ``or ""`` keeps a page that yields no text (or a
            # None result, should the installed PyPDF2 return one) from
            # raising a TypeError during concatenation.
            return "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )

    def load_image(self, file_path):
        """Return the text pytesseract recognises in the image at *file_path*.

        Args:
            file_path: Path of the image file to run OCR on.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        # Use the image as a context manager so its file handle is closed
        # as soon as OCR finishes instead of leaking until garbage collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)