File size: 1,650 Bytes
eefb354
df842c7
eefb354
1d9404d
eefb354
 
 
 
 
 
 
 
 
 
1d9404d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eefb354
 
 
 
 
 
 
 
 
 
 
1d9404d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import fitz  # PyMuPDF
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import docx # Added for docx parsing

def parse_pdf(file_path: str) -> str:
    """Extract the plain text of every page in a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        The text of all pages concatenated in page order, with nothing
        inserted between pages.
    """
    # Using the document as a context manager closes it even when text
    # extraction raises part-way through, so the file handle is not leaked.
    with fitz.open(file_path) as doc:
        # A single join avoids re-allocating a growing string per page.
        return "".join(page.get_text() for page in doc)

def parse_txt(file_path: str) -> str:
    """Read a UTF-8 encoded text file and return its contents.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The whole file as a single string. A leading UTF-8 byte-order mark,
        if the file has one, is not included in the result.

    Raises:
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # "utf-8-sig" decodes exactly like "utf-8" except that it drops a leading
    # BOM; with plain "utf-8" the BOM would surface as a stray "\ufeff" at the
    # start of the returned text.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()

def parse_docx(file_path: str) -> str:
    """Return the text of a DOCX file, one paragraph per line.

    Only the entries of the document's ``paragraphs`` collection are read;
    their ``text`` values are joined with newline characters.

    Args:
        file_path: Path of the DOCX file to open with ``docx.Document``.

    Returns:
        The paragraph texts joined by ``'\\n'``.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return '\n'.join(para.text for para in paragraphs)

def parse_document(file_path: str, file_extension: str) -> str:
    """Dispatch to the parser that matches a file extension.

    Args:
        file_path: Path of the document to parse.
        file_extension: Extension including the leading dot (``".pdf"``,
            ``".txt"`` or ``".docx"``). Matching ignores letter case, so
            ``".PDF"`` is handled like ``".pdf"``.

    Returns:
        The text extracted by the selected parser.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    # Normalise case only for real strings so that any other value still
    # falls through to the ValueError below instead of an AttributeError.
    if isinstance(file_extension, str):
        normalized = file_extension.lower()
    else:
        normalized = file_extension

    if normalized == ".pdf":
        return parse_pdf(file_path)
    if normalized == ".txt":
        return parse_txt(file_path)
    if normalized == ".docx":
        return parse_docx(file_path)
    # Report the extension exactly as the caller supplied it.
    raise ValueError(f"Unsupported file type: {file_extension}")

def chunk_text(
    text: str,
    *,
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> list[str]:
    """Split text into smaller, overlapping chunks.

    Args:
        text: The text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``; measured
            with ``len``, i.e. in characters. Defaults to 1000.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 200.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Create a ``SentenceTransformer`` for the given model name.

    Args:
        model_name: Identifier passed unchanged to the ``SentenceTransformer``
            constructor. Defaults to ``'all-MiniLM-L6-v2'``.

    Returns:
        A newly constructed ``SentenceTransformer`` instance; this function
        itself does no caching, so each call builds a new object.
    """
    embedder = SentenceTransformer(model_name)
    return embedder