File size: 880 Bytes
a141462
c8b6d3b
a141462
c8b6d3b
a141462
c8b6d3b
 
 
 
 
 
 
 
a141462
c8b6d3b
 
 
 
a141462
 
c8b6d3b
a141462
 
 
c8b6d3b
 
a141462
c8b6d3b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# pipelines/utils.py
import io
import docx

def detect_filetype(filename: str, file_bytes: bytes) -> str:
    """Classify a file as "pdf", "image", "docx", "txt" or "unknown".

    The file extension is checked first, case-insensitively. Only when the
    extension gives no answer are the leading bytes checked for the ``%PDF``
    signature.

    Args:
        filename: Name of the file. ``None`` or an empty string is treated
            as a name without any extension.
        file_bytes: Raw content of the file. ``None`` is treated as content
            without any recognizable signature.

    Returns:
        One of ``"pdf"``, ``"image"``, ``"docx"``, ``"txt"`` or ``"unknown"``.
    """
    fname = (filename or "").lower()
    if fname.endswith(".pdf"):
        return "pdf"
    # str.endswith accepts a tuple of suffixes, so no explicit loop is needed.
    if fname.endswith((".png", ".jpg", ".jpeg", ".tiff", ".bmp")):
        return "image"
    if fname.endswith(".docx"):
        return "docx"
    if fname.endswith(".txt"):
        return "txt"
    # filename is already tolerated as None above; tolerate a missing payload
    # the same way instead of raising TypeError on the slice.
    if file_bytes is not None and file_bytes[:4] == b"%PDF":
        return "pdf"
    return "unknown"

def load_doc_text(filetype: str, file_bytes: bytes) -> str:
    """Extract plain text from the file types that are handled locally.

    Args:
        filetype: Type label; only ``"docx"`` and ``"txt"`` are handled here.
        file_bytes: Raw content of the file.

    Returns:
        For ``"docx"``, the text of every paragraph joined with newlines.
        For ``"txt"``, the bytes decoded as UTF-8 with undecodable bytes
        dropped. For any other label, an empty string.
    """
    if filetype == "txt":
        # errors="ignore" drops undecodable bytes instead of raising.
        return file_bytes.decode("utf-8", errors="ignore")

    if filetype == "docx":
        # python-docx reads from a file-like object, so wrap the raw bytes.
        document = docx.Document(io.BytesIO(file_bytes))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    # Every other type is routed to the OpenAI side by the upstream caller.
    return ""