Spaces:
Sleeping
Sleeping
| # upload_ingest.py | |
| from typing import List, Dict, Optional | |
| from pathlib import Path | |
def extract_text_from_files(
    paths: Optional[List[str]], *, chunk_size: int = 1500
) -> Dict[str, List]:
    """Read plain-text uploads and split them into fixed-size chunks for RAG.

    Only files whose suffix is ``.txt`` or ``.md`` (compared
    case-insensitively) are read; every other path is skipped. Files are
    decoded as UTF-8 with undecodable bytes dropped. A file that cannot be
    opened or read is logged and skipped, so one bad upload never aborts
    the whole batch.

    Args:
        paths: File paths to ingest. ``None`` or an empty list yields an
            empty result.
        chunk_size: Maximum number of characters per chunk. The last chunk
            of a file may be shorter. Defaults to 1500.

    Returns:
        A dict with two keys: ``"chunks"``, the text chunks of all files
        that were read, in input order; and ``"artifacts"``, one
        ``{"path": ..., "type": ...}`` dict per file that was read, where
        ``"type"`` is the lower-cased suffix.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Imported locally so this function does not depend on a module-level
    # import that the file does not currently have.
    import logging

    log = logging.getLogger(__name__)
    text_suffixes = {".txt", ".md"}
    chunks: List[str] = []
    artifacts: List[Dict] = []

    for p in paths or []:
        ext = Path(p).suffix.lower()
        if ext not in text_suffixes:
            # TODO: add PDF parsing later if needed.
            continue
        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except (OSError, ValueError) as exc:
            # OSError: missing/unreadable file; ValueError: malformed path
            # (e.g. embedded NUL). Stay best-effort, but leave a trace.
            log.warning("Skipping unreadable upload %s: %s", p, exc)
            continue
        chunks.extend(
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        )
        artifacts.append({"path": p, "type": ext})

    return {"chunks": chunks, "artifacts": artifacts}