Corin1998 committed on
Commit
a141462
·
verified ·
1 Parent(s): e69610e

Update pipelines/utils.py

Browse files
Files changed (1) hide show
  1. pipelines/utils.py +10 -9
pipelines/utils.py CHANGED
@@ -1,7 +1,8 @@
 
1
  import io
2
- import decx
3
 
4
- def detect_filetype(filename: str, file_btes: bytes) -> str:
5
  fname = (filename or "").lower()
6
  if fname.endswith(".pdf"):
7
  return "pdf"
@@ -10,18 +11,18 @@ def detect_filetype(filename: str, file_btes: bytes) -> str:
10
  if fname.endswith(".docx"):
11
  return "docx"
12
  if fname.endswith(".txt"):
13
- return "text"
14
  if file_bytes[:4] == b"%PDF":
15
  return "pdf"
16
  return "unknown"
17
 
18
- def load_text(filetype: str, file_bytes: bytes) -> str:
19
- if filestype == "docx":
20
  f = io.BytesIO(file_bytes)
21
- doc = decx.Document(f)
22
- return "\n".join(p.text for p in doc.paragraphs)
23
- elif filetype == "text":
24
  return file_bytes.decode("utf-8", errors="ignore")
25
  else:
26
- # それ以外は上位でOpenAI側へルーティング
27
  return ""
 
1
+ # pipelines/utils.py
2
  import io
3
+ import docx
4
 
5
+ def detect_filetype(filename: str, file_bytes: bytes) -> str:
6
  fname = (filename or "").lower()
7
  if fname.endswith(".pdf"):
8
  return "pdf"
 
11
  if fname.endswith(".docx"):
12
  return "docx"
13
  if fname.endswith(".txt"):
14
+ return "txt"
15
  if file_bytes[:4] == b"%PDF":
16
  return "pdf"
17
  return "unknown"
18
 
19
+ def load_doc_text(filetype: str, file_bytes: bytes) -> str:
20
+ if filetype == "docx":
21
  f = io.BytesIO(file_bytes)
22
+ doc = docx.Document(f)
23
+ return "\n".join([p.text for p in doc.paragraphs])
24
+ elif filetype == "txt":
25
  return file_bytes.decode("utf-8", errors="ignore")
26
  else:
27
+ # それ以外は上位でOpenAI側へルーティング
28
  return ""