File size: 1,229 Bytes
959439e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import fitz
import docx
from io import BytesIO

async def extract_text(file) -> str:
    """Return the plain text contained in an uploaded PDF, DOCX, or TXT file.

    The format is chosen from the (case-insensitive) extension of
    ``file.filename``; the payload is obtained with ``await file.read()``.

    Args:
        file: Upload object exposing a ``filename`` attribute (may be
            ``None``) and an awaitable ``read()`` returning ``bytes``.
            NOTE(review): presumably a FastAPI/Starlette ``UploadFile`` --
            confirm against the caller.

    Returns:
        The extracted text. PDF pages are concatenated in order, each DOCX
        paragraph is followed by a newline, and TXT content is returned as
        decoded.

    Raises:
        ValueError: If the payload is empty, the extension is unsupported,
            no non-whitespace text could be extracted, or a TXT payload is
            not valid UTF-8.

    Errors raised by the PDF/DOCX parsers themselves (e.g. for a corrupt
    file) are not translated here and propagate to the caller.
    """
    filename = (file.filename or "").lower()
    contents = await file.read()
    if not contents:
        raise ValueError("Uploaded file is empty.")

    if filename.endswith(".pdf"):
        # The context manager closes the document even if extraction fails,
        # so the parser's resources are not leaked.
        with fitz.open(stream=contents, filetype="pdf") as pdf:
            text = "".join(page.get_text() for page in pdf)
        if not text.strip():
            raise ValueError("Could not extract text from PDF.")
        return text

    if filename.endswith(".docx"):
        doc = docx.Document(BytesIO(contents))
        # Only body paragraphs are read; each one is newline-terminated.
        text = "".join(f"{para.text}\n" for para in doc.paragraphs)
        if not text.strip():
            raise ValueError("Could not extract text from DOCX.")
        return text

    if filename.endswith(".txt"):
        try:
            # "utf-8-sig" is plain UTF-8 that also drops a leading BOM, so
            # the result never starts with a stray U+FEFF and a BOM-only
            # file is correctly reported as empty.
            text = contents.decode("utf-8-sig")
        except UnicodeDecodeError as e:
            raise ValueError("TXT file must be UTF-8 encoded.") from e
        if not text.strip():
            raise ValueError("Uploaded TXT file is empty.")
        return text

    raise ValueError("Unsupported file type. Please upload PDF, DOCX, or TXT.")