oluinioluwa814 committed on
Commit
eec8412
·
verified ·
1 Parent(s): 64b893d

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +25 -45
utils.py CHANGED
@@ -1,52 +1,32 @@
1
-
2
- ## File: `utils.py`
3
- """Helper utilities: text extraction, chunking, safe getters."""
4
- from typing import List
5
- import textract
6
  from pathlib import Path
7
- import os
8
 
9
def extract_text_from_file(path: str) -> str:
    """Extract text from PDF, DOCX, TXT and other files using textract.

    Args:
        path: Filesystem path of the document to read.

    Returns:
        The extracted text as a ``str``. An empty string is returned when
        ``path`` is not an existing regular file, or when extraction fails
        for any reason (the error is printed, never raised).
    """
    try:
        path_obj = Path(path)
        # Directories also "exist" but can never be extracted; treat them
        # like a missing file instead of handing them to textract.
        if not path_obj.is_file():
            return ""
        text = textract.process(str(path_obj))
        if isinstance(text, bytes):
            # Undecodable bytes are dropped rather than failing the whole
            # extraction.
            text = text.decode('utf-8', errors='ignore')
        return text
    except Exception as e:
        # Deliberate best-effort boundary: callers rely on getting "" back
        # rather than an exception, so report the problem and carry on.
        print(f"[extract_text_from_file] Error extracting {path}: {e}")
        return ""
24
 
 
 
25
 
26
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks of whitespace-separated tokens.

    Args:
        text: Input text; it is tokenized with ``str.split()``.
        chunk_size: Maximum number of tokens per chunk. Must be positive.
        overlap: Number of tokens shared between consecutive chunks. Must be
            smaller than ``chunk_size`` so that every chunk makes progress.

    Returns:
        The chunks in order, each one a string of tokens joined by single
        spaces. An empty list is returned for empty or whitespace-only text.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``; such values would otherwise make
            the chunk window stop advancing and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    if not text:
        return []

    tokens = text.split()
    total = len(tokens)
    # Each chunk starts this many tokens after the previous one.
    stride = chunk_size - overlap

    chunks = []
    for start in range(0, total, stride):
        end = start + chunk_size
        chunks.append(" ".join(tokens[start:end]))
        # Stop once a chunk reaches the last token; a further chunk would
        # only repeat the tail of this one.
        if end >= total:
            break
    return chunks
41
 
 
 
 
42
 
43
def safe_get(d: dict, path: list, default=None):
    """Safely traverse a nested dict/list structure.

    Args:
        d: The root container to start from.
        path: Sequence of keys and/or indexes applied one after another
            with ``[]`` indexing.
        default: Value returned when the path cannot be followed.

    Returns:
        The value found at the end of ``path``, or ``default`` when a key
        is missing, an index is out of range, or an intermediate value
        cannot be indexed (for example ``None`` or an int). An empty
        ``path`` returns ``d`` itself.
    """
    cur = d
    try:
        for p in path:
            cur = cur[p]
    except (LookupError, TypeError):
        # LookupError covers KeyError and IndexError; TypeError covers
        # non-subscriptable values, unhashable keys and a non-iterable path.
        return default
    return cur
52
-
 
1
+ import PyPDF2
2
+ import docx
 
 
 
3
  from pathlib import Path
 
4
 
5
def load_text(path: str) -> str:
    """Load text from TXT, PDF, or DOCX files.

    The file type is chosen from the file extension, compared
    case-insensitively.

    Args:
        path: Filesystem path of the document to read.

    Returns:
        The extracted text as a string. For PDFs, the text of every page
        that yields any text, each followed by a newline. For DOCX files,
        the paragraph texts joined by newlines.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file extension is not ``.txt``, ``.pdf`` or
            ``.docx``.
    """
    path_obj = Path(path)
    if not path_obj.exists():
        raise FileNotFoundError(f"{path} does not exist.")

    # Normalize once instead of re-computing the suffix for every branch.
    suffix = path_obj.suffix.lower()

    if suffix == ".txt":
        # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, so
        # the BOM does not leak into the returned text as "\ufeff".
        return path_obj.read_text(encoding="utf-8-sig")

    if suffix == ".pdf":
        with open(path_obj, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # Pages for which extract_text() returns nothing are skipped.
            page_texts = [page.extract_text() for page in reader.pages]
            return "".join(t + "\n" for t in page_texts if t)

    if suffix == ".docx":
        # Pass a plain string path rather than a Path object.
        doc = docx.Document(str(path_obj))
        return "\n".join(p.text for p in doc.paragraphs)

    raise ValueError(f"Unsupported file type: {path_obj.suffix}")