File size: 1,384 Bytes
a53dc0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import os
import docx
import pdfplumber
import nltk

# Fetch the "punkt" tokenizer data when this module is imported; this runs on
# every import and may hit the network. The data is presumably what
# sent_tokenize (used below for sentence chunking of PDF text) relies on.
# NOTE(review): newer NLTK releases reportedly load "punkt_tab" instead of
# "punkt" -- confirm against the installed NLTK version.
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def extract_text_from_docx(path):
    """Return the non-empty paragraphs of a DOCX file as stripped strings.

    Args:
        path: Location of the .docx document to read.

    Returns:
        A list with one whitespace-stripped string per non-blank paragraph.
        The list is empty when the document has no text or when reading it
        fails for any reason (the failure is printed, not raised).
    """
    try:
        document = docx.Document(path)
        cleaned = []
        for paragraph in document.paragraphs:
            text = paragraph.text.strip()
            # Skip paragraphs that contain only whitespace.
            if text:
                cleaned.append(text)
        return cleaned
    except Exception as e:
        # Best-effort extraction: report the problem and hand back nothing.
        print(f"❌ Failed to extract DOCX: {e}")
        return []

def extract_text_from_pdf(path):
    """Extract the text of a PDF as a list of paragraph-sized chunks.

    The text of all pages is joined with newlines and then split on blank
    lines. If that yields more than one paragraph, those paragraphs are
    returned. Otherwise (the text has no blank-line separators) the text is
    split into sentences and grouped five sentences per chunk.

    Args:
        path: Location of the .pdf document to read.

    Returns:
        A list of non-empty strings. The list is empty when the PDF has no
        extractable text or when opening/reading it fails (the failure is
        printed, not raised).
    """
    try:
        with pdfplumber.open(path) as pdf:
            # Extract each page exactly once; text extraction is the costly
            # step, so do not repeat it for the emptiness filter.
            page_texts = [page.extract_text() for page in pdf.pages]
    except Exception as e:
        # Best-effort extraction: report the problem and hand back nothing.
        print(f"❌ Failed to extract PDF: {e}")
        return []

    # Pages without text yield None or "" and are dropped here.
    all_text = "\n".join(text for text in page_texts if text)
    if not all_text.strip():
        return []

    # Try splitting by paragraphs (blank-line separated).
    paragraphs = [p.strip() for p in all_text.split("\n\n") if p.strip()]
    # A single "paragraph" means no blank-line separator was found, so the
    # split did not actually segment anything; only accept a real split.
    if len(paragraphs) > 1:
        return paragraphs

    # Fallback: break into chunks of up to 5 sentences.
    try:
        sentences = sent_tokenize(all_text)
    except LookupError as e:
        # Tokenizer data is unavailable; keep the unsplit text rather than
        # failing a call that otherwise produced usable output.
        print(f"❌ Failed to tokenize sentences: {e}")
        return paragraphs

    chunks = [" ".join(sentences[i:i + 5]) for i in range(0, len(sentences), 5)]
    # Never return less than the paragraph split already gave us.
    return chunks or paragraphs

def extract_paragraphs(path):
    """Dispatch to the extractor matching the file extension of *path*.

    Args:
        path: Location of a .docx or .pdf document; the extension is
            matched case-insensitively.

    Returns:
        Whatever the selected extractor returns for that file.

    Raises:
        ValueError: If the extension is neither ".docx" nor ".pdf".
    """
    _, suffix = os.path.splitext(path)
    suffix = suffix.lower()

    if suffix == ".docx":
        return extract_text_from_docx(path)
    if suffix == ".pdf":
        return extract_text_from_pdf(path)

    raise ValueError(f"Unsupported file type: {suffix}")