File size: 1,495 Bytes
d9e9326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# app/core/utils.py
from pathlib import Path
from typing import Union
from pypdf import PdfReader


def extract_text(file_path: Union[str, Path]) -> str:
    """
    Return the cleaned text content of a supported document.

    The reader is selected from the file extension, compared
    case-insensitively: ``.pdf`` goes to the PDF reader, ``.txt`` and ``.md``
    go to the plain-text reader.

    Args:
        file_path: Location of the document, as a string or ``Path``.

    Returns:
        The text produced by the selected reader.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    source = Path(file_path)
    extension = source.suffix.lower()

    # Guard-clause dispatch: each supported family returns immediately.
    if extension == ".pdf":
        return _extract_from_pdf(source)
    if extension in {".txt", ".md"}:
        return _extract_from_txt(source)

    raise ValueError(f"Unsupported file type: {extension}")


def _extract_from_pdf(file_path: Path) -> str:
    """Return the cleaned text of all pages in a PDF, read with PyPDF.

    Page texts are joined with newlines and then passed through
    ``_clean_text``. Any exception raised while opening or reading the
    document is reported with a printed warning and results in ``""``.
    """
    try:
        document = PdfReader(str(file_path))
        page_texts = []
        for page in document.pages:
            content = page.extract_text()
            # Substitute an empty string when a page yields a falsy result.
            page_texts.append(content if content else "")
        combined = "\n".join(page_texts)
        return _clean_text(combined)
    except Exception as e:
        print(f"[WARN] Error reading PDF: {e}")
        return ""


def _extract_from_txt(file_path: Path) -> str:
    """Read a plain-text or Markdown file and return its cleaned contents.

    The file is decoded as UTF-8. A leading byte-order mark is discarded and
    undecodable bytes are dropped instead of raising.

    Args:
        file_path: Location of the file to read.

    Returns:
        The file's text after ``_clean_text`` normalization, or ``""`` when
        the file cannot be read (a warning is printed in that case).
    """
    try:
        # "utf-8-sig" decodes like "utf-8" but strips a leading BOM. With
        # plain "utf-8" the BOM survives as U+FEFF at the start of the text,
        # and since it is not whitespace the cleaning step does not remove it.
        raw = file_path.read_text(encoding="utf-8-sig", errors="ignore")
    except (OSError, ValueError) as e:
        # Best-effort contract: a missing/unreadable file or a malformed path
        # is reported and yields an empty result instead of propagating.
        print(f"[WARN] Error reading TXT: {e}")
        return ""
    return _clean_text(raw)


def _clean_text(text: str) -> str:
    """Collapse every run of whitespace in *text* into a single space.

    Carriage returns, newlines, and tabs all count as whitespace, so the
    result is one line with no leading or trailing blanks.
    """
    # split() with no separator breaks on any whitespace run and discards
    # empty pieces at both ends, so no separate replace/strip pass is needed.
    words = text.split()
    return " ".join(words)