File size: 1,710 Bytes
4cab845 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import fitz # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
import docx # Added for docx parsing
import os
def parse_pdf(file_path: str) -> str:
    """Extract the full plain text of a PDF file.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        The concatenated text of all pages, in page order.

    Raises:
        Whatever ``fitz.open`` raises for a missing/corrupt file.
    """
    doc = fitz.open(file_path)
    try:
        # Collect per-page text and join once: avoids quadratic += growth
        # and guarantees the document handle is closed even if extraction
        # raises partway through (the original leaked it on error).
        pages = [page.get_text() for page in doc]
    finally:
        doc.close()
    return "".join(pages)
def parse_txt(file_path: str) -> str:
    """Read a plain-text file and return its entire contents.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return handle.read()
def parse_docx(file_path: str) -> str:
    """Extract the text of a DOCX file, one line per paragraph.

    Args:
        file_path: Path to the .docx file.

    Returns:
        All paragraph texts joined with a trailing newline after each
        (identical output to concatenating ``paragraph.text + "\\n"``).
    """
    document = docx.Document(file_path)
    # Single join instead of repeated string += (which is O(n^2)).
    return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)
def parse_document(file_path: str, file_type: str) -> str:
    """Parse a document, dispatching on its file type.

    Args:
        file_path: Path to the document on disk.
        file_type: One of "pdf", "txt", or "docx" (now matched
            case-insensitively, so "PDF" also works — a backward-compatible
            generalization).

    Returns:
        The extracted text of the document.

    Raises:
        ValueError: If ``file_type`` is not a supported type.
    """
    parsers = {
        "pdf": parse_pdf,
        "txt": parse_txt,
        "docx": parse_docx,
    }
    try:
        parser = parsers[file_type.lower()]
    except KeyError:
        # Same exception type and message as the original if/elif chain.
        raise ValueError(f"Unsupported file type: {file_type}") from None
    return parser(file_path)
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list:
    """Split *text* into overlapping chunks.

    Uses LangChain's RecursiveCharacterTextSplitter, measuring chunk
    length in characters via ``len``.

    Args:
        text: The text to split.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_text(text)
def get_embedding_model():
    """Compatibility stub: always returns None.

    Embedding generation happens through the Gemini API elsewhere, so
    there is no local model object to hand back. The function is kept
    only so existing callers do not break.
    """
    return None