File size: 1,650 Bytes
eefb354 df842c7 eefb354 1d9404d eefb354 1d9404d eefb354 1d9404d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import fitz # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import docx # Added for docx parsing
def parse_pdf(file_path: str) -> str:
    """Extract the plain text of every page in a PDF file.

    Args:
        file_path: Path to the PDF document.

    Returns:
        The concatenated text of all pages, in page order.
    """
    # Context manager guarantees the document handle is released even if
    # get_text() raises; "".join avoids quadratic string concatenation.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def parse_txt(file_path: str) -> str:
    """Read and return the entire contents of a UTF-8 text file."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
def parse_docx(file_path: str) -> str:
    """Extract text from a DOCX file, one output line per paragraph."""
    paragraphs = docx.Document(file_path).paragraphs
    return '\n'.join(p.text for p in paragraphs)
def parse_document(file_path: str, file_extension: str) -> str:
    """Dispatch to the correct parser based on the file extension.

    Args:
        file_path: Path to the document to parse.
        file_extension: Extension including the leading dot (e.g. ".pdf").
            Matched case-insensitively, so ".PDF" also dispatches.

    Returns:
        The extracted plain text of the document.

    Raises:
        ValueError: If the extension is not .pdf, .txt, or .docx.
    """
    # Normalize so uppercase extensions (".PDF", ".Txt") still dispatch
    # instead of falling through to the error branch.
    ext = file_extension.lower()
    if ext == ".pdf":
        return parse_pdf(file_path)
    if ext == ".txt":
        return parse_txt(file_path)
    if ext == ".docx":
        return parse_docx(file_path)
    raise ValueError(f"Unsupported file type: {file_extension}")
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    """Split text into overlapping chunks suitable for embedding/retrieval.

    Args:
        text: The raw document text to split.
        chunk_size: Maximum characters per chunk (default 1000, as before).
        chunk_overlap: Characters shared between adjacent chunks
            (default 200, as before).

    Returns:
        The list of chunk strings produced by the recursive splitter.
    """
    # Parameterized so callers can tune chunking per document type while
    # the defaults preserve the original behavior.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_text(text)
def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Load and return the sentence-transformers embedding model."""
    model = SentenceTransformer(model_name)
    return model