Rajan Sharma committed on
Commit
e9fc2e1
·
verified ·
1 Parent(s): 6c48427

Update upload_ingest.py

Browse files
Files changed (1) hide show
  1. upload_ingest.py +20 -7
upload_ingest.py CHANGED
@@ -1,9 +1,22 @@
1
  # upload_ingest.py
2
- def extract_text_from_files(files):
3
- chunks=[]
4
- for f in files:
5
- if f.endswith(".txt"):
6
- with open(f,"r") as fh:
7
- chunks.append(fh.read())
8
- return {"chunks":chunks}
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # upload_ingest.py
2
+ from typing import List, Dict, Optional
3
+ from pathlib import Path
 
 
 
 
 
4
 
5
def extract_text_from_files(
    paths: Optional[List[str]], *, chunk_size: int = 1500
) -> Dict[str, List]:
    """Read plain-text uploads and split them into fixed-size chunks for RAG.

    Only files whose suffix (case-insensitive) is ``.txt`` or ``.md`` are
    read; every other path is ignored. Files are decoded as UTF-8 with
    undecodable bytes dropped. A file that cannot be opened or read is
    logged and skipped, so one bad upload does not abort the batch.

    Args:
        paths: File paths to ingest. ``None`` or an empty list is accepted
            and produces an empty result.
        chunk_size: Maximum number of characters per chunk. Defaults to
            1500.

    Returns:
        A dict with two keys:
        ``"chunks"`` - list of text slices, in input order, each at most
        ``chunk_size`` characters long;
        ``"artifacts"`` - one ``{"path": <path>, "type": <suffix>}`` dict
        per file that was read successfully (including empty files).

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Function-scope import keeps this block self-contained without
    # touching the module's import section.
    import logging

    # Validate up front: a zero step would otherwise make range() raise
    # in the middle of processing.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    log = logging.getLogger(__name__)
    chunks: List[str] = []
    artifacts: List[Dict[str, str]] = []

    for p in paths or []:
        ext = Path(p).suffix.lower()
        if ext not in {".txt", ".md"}:
            # Add PDF parsing later if needed
            continue

        # Keep the try body limited to the I/O that can actually fail.
        try:
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except (OSError, ValueError) as exc:
            # OSError: missing/unreadable file or a directory;
            # ValueError: e.g. an embedded null byte in the path.
            log.warning("Skipping unreadable upload %s: %s", p, exc)
            continue

        chunks.extend(
            text[i:i + chunk_size] for i in range(0, len(text), chunk_size)
        )
        artifacts.append({"path": p, "type": ext})

    return {"chunks": chunks, "artifacts": artifacts}