File size: 822 Bytes
e9fc2e1
 
ef17f73
e9fc2e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521ffa1
e9fc2e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
import logging
from pathlib import Path
from typing import List, Dict, Optional

def extract_text_from_files(
    paths: Optional[List[str]], chunk_size: int = 1500
) -> Dict[str, list]:
    """Read plain-text files and split their contents into chunks for RAG.

    Only files whose suffix is ``.txt`` or ``.md`` (compared
    case-insensitively) are read; every other path is skipped. Files are
    decoded as UTF-8 and undecodable bytes are dropped. A file that cannot
    be opened or read is skipped with a logged warning rather than aborting
    the whole batch.

    Args:
        paths: File paths to process. ``None`` or an empty list produces
            empty results.
        chunk_size: Maximum number of characters per chunk. Defaults to 1500.

    Returns:
        A dict with two keys:
        ``"chunks"`` - list of text pieces, in input order, each at most
        ``chunk_size`` characters long;
        ``"artifacts"`` - list of ``{"path": <path>, "type": <suffix>}``
        dicts, one per file that was read successfully (including empty
        files, which contribute no chunks).

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    text_suffixes = {".txt", ".md"}
    chunks: List[str] = []
    artifacts: List[Dict] = []

    for p in paths or []:
        ext = Path(p).suffix.lower()
        if ext not in text_suffixes:
            # (PDF/Docx parsing can be added later.)
            continue

        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except (OSError, ValueError) as exc:
            # Best-effort: one unreadable file must not fail the batch, but
            # leave a trace so the omission can be diagnosed. ValueError
            # covers malformed paths (e.g. embedded NUL characters).
            logging.getLogger(__name__).warning(
                "Skipping unreadable file %s: %s", p, exc
            )
            continue

        chunks.extend(
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        )
        artifacts.append({"path": p, "type": ext})

    return {"chunks": chunks, "artifacts": artifacts}