|
|
import fitz |
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
|
import docx |
|
|
import os |
|
|
|
|
|
def parse_pdf(file_path: str) -> str: |
|
|
"""Extracts text from a PDF file.""" |
|
|
doc = fitz.open(file_path) |
|
|
text = "" |
|
|
for page in doc: |
|
|
text += page.get_text() |
|
|
doc.close() |
|
|
return text |
|
|
|
|
|
def parse_txt(file_path: str) -> str: |
|
|
"""Extracts text from a TXT file.""" |
|
|
with open(file_path, 'r', encoding='utf-8') as f: |
|
|
text = f.read() |
|
|
return text |
|
|
|
|
|
def parse_docx(file_path: str) -> str: |
|
|
"""Extracts text from a DOCX file.""" |
|
|
doc = docx.Document(file_path) |
|
|
text = "" |
|
|
for paragraph in doc.paragraphs: |
|
|
text += paragraph.text + "\n" |
|
|
return text |
|
|
|
|
|
def parse_document(file_path: str, file_type: str) -> str: |
|
|
"""Parse document based on file type.""" |
|
|
if file_type == "pdf": |
|
|
return parse_pdf(file_path) |
|
|
elif file_type == "txt": |
|
|
return parse_txt(file_path) |
|
|
elif file_type == "docx": |
|
|
return parse_docx(file_path) |
|
|
else: |
|
|
raise ValueError(f"Unsupported file type: {file_type}") |
|
|
|
|
|
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list: |
|
|
"""Split text into chunks using LangChain's text splitter.""" |
|
|
text_splitter = RecursiveCharacterTextSplitter( |
|
|
chunk_size=chunk_size, |
|
|
chunk_overlap=chunk_overlap, |
|
|
length_function=len, |
|
|
) |
|
|
chunks = text_splitter.split_text(text) |
|
|
return chunks |
|
|
|
|
|
def get_embedding_model(): |
|
|
"""Return None since we're using Gemini API for embeddings.""" |
|
|
|
|
|
|
|
|
return None |