Spaces:

oluinioluwa814
/

RAG

Sleeping

App Files Files Community

oluinioluwa814 committed on Dec 11, 2025

Commit

eec8412

verified ·

1 Parent(s): 64b893d

Update utils.py

Browse files

Files changed (1) hide show

utils.py +25 -45

utils.py CHANGED Viewed

@@ -1,52 +1,32 @@
-## File: `utils.py`
-"""Helper utilities: text extraction, chunking, safe getters."""
-from typing import List
-import textract
 from pathlib import Path
-import os
-def extract_text_from_file(path: str) -> str:
-    """Extract text from PDF, DOCX, TXT and other files using textract.
-    Returns a unicode string. If extraction fails, returns empty string.
     """
-    try:
-        path_obj = Path(path)
-        if not path_obj.exists():
-            return ""
-        text = textract.process(str(path_obj))
-        if isinstance(text, bytes):
-            text = text.decode('utf-8', errors='ignore')
-        return text
-    except Exception as e:
-        print(f"[extract_text_from_file] Error extracting {path}: {e}")
-        return ""
-def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
-    """Split text into overlapping chunks."""
-    if not text:
-        return []
-    tokens = text.split()
-    chunks = []
-    start = 0
-    while start < len(tokens):
-        end = min(start + chunk_size, len(tokens))
-        chunk = " ".join(tokens[start:end])
-        chunks.append(chunk)
-        if end == len(tokens):
-            break
-        start = end - overlap
-    return chunks
-def safe_get(d: dict, path: list, default=None):
-    """Safely traverse nested dict/list. path is list of keys/indexes."""
-    cur = d
-    try:
-        for p in path:
-            cur = cur[p]
-        return cur
-    except Exception:
-        return default

+import PyPDF2
+import docx
 from pathlib import Path
+def load_text(path: str) -> str:
     """
+    Load text from TXT, PDF, or DOCX files.
+    Returns the extracted text as a string.
+    """
+    path_obj = Path(path)
+    if not path_obj.exists():
+        raise FileNotFoundError(f"{path} does not exist.")
+    if path_obj.suffix.lower() == ".txt":
+        return path_obj.read_text(encoding="utf-8")
+    elif path_obj.suffix.lower() == ".pdf":
+        text = ""
+        with open(path_obj, "rb") as f:
+            reader = PyPDF2.PdfReader(f)
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+        return text
+    elif path_obj.suffix.lower() == ".docx":
+        doc = docx.Document(path_obj)
+        return "\n".join([p.text for p in doc.paragraphs])
+    else:
+        raise ValueError(f"Unsupported file type: {path_obj.suffix}")