File size: 1,259 Bytes
29fbb51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from io import BytesIO
from fastapi import UploadFile, HTTPException
import PyPDF2
import docx

async def extract_text_from_file(file: UploadFile) -> str:
    """Extract the text of an uploaded PDF, DOCX, or plain-text file.

    The parser is selected from the upload's declared ``content_type``;
    the file's bytes are not inspected to determine the format.

    Args:
        file: The uploaded file. Its content is read fully into memory.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 415 if the declared content type is not PDF, DOCX,
            or plain text; 400 if a plain-text upload is not valid UTF-8.
    """
    content = await file.read()

    # Compare only the bare media type: the declared value may carry
    # parameters ("text/plain; charset=utf-8"), differ in letter case, or
    # be missing entirely.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()

    if media_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))
    if media_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(BytesIO(content))
    if media_type == "text/plain":
        try:
            # "utf-8-sig" decodes ordinary UTF-8 and also drops a leading
            # byte-order mark so it does not leak into the returned text.
            return content.decode("utf-8-sig")
        except UnicodeDecodeError as exc:
            # Report undecodable input as a client error instead of
            # letting the decode failure escape as an unhandled exception.
            raise HTTPException(
                status_code=400,
                detail="Could not decode the text file. Please upload a UTF-8 encoded .txt file.",
            ) from exc

    raise HTTPException(
        status_code=415,
        detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file."
    )

def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Return the text of every page in a PDF, concatenated in page order.

    Pages for which the reader yields no text contribute an empty string.
    No separator is inserted between pages.
    """
    pdf = PyPDF2.PdfReader(file_stream)
    # `or ""` guards against extract_text() returning a falsy value.
    return "".join(pg.extract_text() or "" for pg in pdf.pages)

def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Return the text of a DOCX document's paragraphs, one per line.

    Each paragraph's text is followed by a newline, so a non-empty result
    always ends with a newline.
    """
    document = docx.Document(file_stream)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)