Spaces:
Runtime error
Runtime error
Upload 3 files
Browse files
- documents/__init__.py +1 -0
- documents/chunker.py +11 -0
- documents/loader.py +17 -0
documents/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Empty init files
|
documents/chunker.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 2 |
+
|
| 3 |
+
class DocumentChunker:
    """Split text into chunks using a RecursiveCharacterTextSplitter."""

    def __init__(self, chunk_size=500, chunk_overlap=50):
        """Build the underlying splitter.

        Args:
            chunk_size: Forwarded to the splitter as ``chunk_size``
                (presumably a maximum length in characters -- confirm
                against the splitter's documentation).
            chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        """
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
        }
        self.splitter = RecursiveCharacterTextSplitter(**splitter_options)

    def chunk_text(self, text):
        """Return whatever the configured splitter's ``split_text`` yields for *text*."""
        pieces = self.splitter.split_text(text)
        return pieces
|
documents/loader.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import PyPDF2
|
| 2 |
+
from PIL import Image
|
| 3 |
+
import pytesseract
|
| 4 |
+
|
| 5 |
+
class DocumentLoader:
    """Extract plain text from PDF files and from images (via OCR)."""

    def load_pdf(self, file_path):
        """Return the text of every page of a PDF, concatenated in page order.

        Args:
            file_path: Path of the PDF file; it is opened in binary mode.

        Returns:
            A single string with the extracted text of all pages. No
            separator is inserted between pages.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extraction stays inside the `with` block, while the file is open.
            # `or ""` keeps a page that yields no text from breaking the join.
            return "".join(
                page.extract_text() or "" for page in pdf_reader.pages
            )

    def load_image(self, file_path):
        """Return the text that pytesseract recognises in an image file.

        Args:
            file_path: Path of the image file to run OCR on.

        Returns:
            The string produced by ``pytesseract.image_to_string``.
        """
        # Use the image as a context manager so its file handle is released
        # as soon as OCR has finished instead of lingering until garbage
        # collection.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)
|