Spaces:
Sleeping
Sleeping
Rajan Sharma
committed on
Update upload_ingest.py
Browse files- upload_ingest.py +20 -7
upload_ingest.py
CHANGED
|
@@ -1,9 +1,22 @@
|
|
| 1 |
# upload_ingest.py
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
for f in files:
|
| 5 |
-
if f.endswith(".txt"):
|
| 6 |
-
with open(f,"r") as fh:
|
| 7 |
-
chunks.append(fh.read())
|
| 8 |
-
return {"chunks":chunks}
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# upload_ingest.py
|
| 2 |
+
from typing import List, Dict, Optional
|
| 3 |
+
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
+
def extract_text_from_files(
    paths: Optional[List[str]],
    *,
    chunk_size: int = 1500,
) -> Dict[str, List]:
    """Read plain-text uploads and split them into fixed-size chunks.

    Only files whose suffix (case-insensitive) is ``.txt`` or ``.md`` are
    read; every other path is skipped. Files that cannot be opened or read
    are skipped with a logged warning instead of aborting the whole batch.

    Args:
        paths: File paths to ingest. ``None`` or an empty list yields an
            empty result.
        chunk_size: Maximum number of characters per chunk. Must be
            positive. Defaults to 1500.

    Returns:
        A dict with two keys:
        ``"chunks"`` -- list of text slices, in input order, each at most
        ``chunk_size`` characters long;
        ``"artifacts"`` -- one ``{"path": ..., "type": ...}`` dict per file
        that was read successfully, where ``type`` is the lower-cased suffix.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    log = logging.getLogger(__name__)
    text_suffixes = {".txt", ".md"}
    chunks: List[str] = []
    artifacts: List[Dict] = []

    for p in paths or []:
        ext = Path(p).suffix.lower()
        if ext not in text_suffixes:
            # TODO: add PDF parsing later if needed.
            continue

        try:
            # errors="ignore" drops undecodable bytes, so a stray non-UTF-8
            # byte never rejects an otherwise readable upload.
            with open(p, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
        except (OSError, ValueError) as exc:
            # Best effort: one unreadable file must not fail the batch, but
            # leave a trace instead of swallowing the error silently.
            log.warning("Skipping unreadable file %r: %s", p, exc)
            continue

        chunks.extend(
            text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)
        )
        artifacts.append({"path": p, "type": ext})

    return {"chunks": chunks, "artifacts": artifacts}
|